#Image Captioning
import os  # file-system handling
import pickle  # storing numpy features
import re  # caption text cleaning

import numpy as np
from tqdm.notebook import tqdm  # progress bars over the dataset
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input  # extract features from image data
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add
# Raw strings so the Windows-path backslashes are not treated as (invalid)
# escape sequences such as '\M' and '\I'.
BASE_DIR = r'E:\Masters\Image_captioning'     # dataset root (Images/, captions.txt)
WORKING_DIR = r'E:\Masters\Image_captioning'  # output dir (features.pkl, saved model)
# Load the pre-trained VGG16 network and drop its classification head:
# the second-to-last layer ('fc2', 4096 units) becomes the feature output.
vgg = VGG16()
model = Model(inputs=vgg.inputs, outputs=vgg.layers[-2].output)
# Show the truncated architecture.
print(model.summary())
Model: "model"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
input_1 (InputLayer) [(None, 224, 224, 3)] 0
block1_conv1 (Conv2D) (None, 224, 224, 64) 1792
block1_conv2 (Conv2D) (None, 224, 224, 64) 36928
block1_pool (MaxPooling2D) (None, 112, 112, 64) 0
block2_conv1 (Conv2D) (None, 112, 112, 128) 73856
block2_conv2 (Conv2D) (None, 112, 112, 128) 147584
block2_pool (MaxPooling2D) (None, 56, 56, 128) 0
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
input_1 (InputLayer) [(None, 224, 224, 3)] 0
block1_conv1 (Conv2D) (None, 224, 224, 64) 1792
block1_conv2 (Conv2D) (None, 224, 224, 64) 36928
block1_pool (MaxPooling2D) (None, 112, 112, 64) 0
block2_conv1 (Conv2D) (None, 112, 112, 128) 73856
block2_conv2 (Conv2D) (None, 112, 112, 128) 147584
block2_pool (MaxPooling2D) (None, 56, 56, 128) 0
block3_conv1 (Conv2D) (None, 56, 56, 256) 295168
block3_conv2 (Conv2D) (None, 56, 56, 256) 590080
block3_conv3 (Conv2D) (None, 56, 56, 256) 590080
block3_pool (MaxPooling2D) (None, 28, 28, 256) 0
block4_conv1 (Conv2D) (None, 28, 28, 512) 1180160
block4_conv2 (Conv2D) (None, 28, 28, 512) 2359808
block4_conv3 (Conv2D) (None, 28, 28, 512) 2359808
block4_pool (MaxPooling2D) (None, 14, 14, 512) 0
block5_conv1 (Conv2D) (None, 14, 14, 512) 2359808
block5_conv2 (Conv2D) (None, 14, 14, 512) 2359808
block5_conv3 (Conv2D) (None, 14, 14, 512) 2359808
block5_pool (MaxPooling2D) (None, 7, 7, 512) 0
flatten (Flatten) (None, 25088) 0
fc1 (Dense) (None, 4096) 102764544
fc2 (Dense) (None, 4096) 16781312
=================================================================
Total params: 134,260,544
Trainable params: 134,260,544
Non-trainable params: 0
_________________________________________________________________
None
# Extract a VGG16 'fc2' feature vector for every image in the dataset.
features = {}
directory = os.path.join(BASE_DIR, 'Images')
for img_name in tqdm(os.listdir(directory)):
    # load and resize to VGG16's expected 224x224 input
    img_path = os.path.join(directory, img_name)  # portable join, not '+'
    image = load_img(img_path, target_size=(224, 224))
    # convert image pixels to a numpy array (224, 224, 3)
    image = img_to_array(image)
    # add the batch axis -> (1, 224, 224, 3)
    image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
    # model-specific preprocessing expected by VGG16
    image = preprocess_input(image)
    # forward pass up to the fc2 layer
    feature = model.predict(image, verbose=0)
    # key features by the file name without its extension
    image_id = img_name.split('.')[0]
    features[image_id] = feature
0%| | 0/8092 [00:00<?, ?it/s]
# Persist the extracted features; a context manager guarantees the file
# handle is closed even if pickling fails (the original leaked it).
with open(os.path.join(WORKING_DIR, 'features.pkl'), 'wb') as f:
    pickle.dump(features, f)
# Reload the features from disk.
with open(os.path.join(WORKING_DIR, 'features.pkl'), 'rb') as f:
    features = pickle.load(f)
with open(os.path.join(BASE_DIR, 'captions.txt'), 'r') as f:
    next(f)  # skip the CSV header line
    captions_doc = f.read()
# Map each image id to its list of captions.
mapping = {}
for line in tqdm(captions_doc.split('\n')):
    # each line is "<image file>,<caption...>"
    tokens = line.split(',')
    # Skip blank/malformed lines. Check the TOKEN count, not the raw line
    # length: the original `len(line) < 2` let a comma-less line through
    # and stored an empty caption for it.
    if len(tokens) < 2:
        continue
    image_id, caption = tokens[0], tokens[1:]
    # drop the file extension from the image id
    image_id = image_id.split('.')[0]
    # the caption itself may contain commas; rejoin the remainder
    caption = " ".join(caption)
    # append to this image's caption list, creating it on first sight
    mapping.setdefault(image_id, []).append(caption)
0%| | 0/40456 [00:00<?, ?it/s]
len(mapping)
8091
Preprocess Text Data
def clean(mapping):
    """Normalize every caption in *mapping* in place.

    Lower-cases the text, strips digits/punctuation, collapses whitespace,
    drops single-character tokens, and wraps each caption in
    'startseq ... endseq' markers for the sequence model.
    """
    for key, captions in mapping.items():
        for i in range(len(captions)):
            # take one caption at a time
            caption = captions[i]
            # convert to lowercase
            caption = caption.lower()
            # BUG FIX: str.replace() treats its argument literally, so the
            # original '[^A-Za-z]' / '\s+' "patterns" never matched anything.
            # Use re.sub to actually delete digits/special chars and
            # squeeze the resulting whitespace.
            caption = re.sub(r'[^a-z]+', ' ', caption)
            caption = re.sub(r'\s+', ' ', caption).strip()
            # keep words longer than one character; add start/end tags
            caption = 'startseq ' + " ".join(
                word for word in caption.split() if len(word) > 1
            ) + ' endseq'
            captions[i] = caption
mapping['1000268201_693b08cb0e']
['A child in a pink dress is climbing up a set of stairs in an entry way .', 'A girl going into a wooden building .', 'A little girl climbing into a wooden playhouse .', 'A little girl climbing the stairs to her playhouse .', 'A little girl in a pink dress going into a wooden cabin .']
The captions contain stray punctuation, so we clean the data using the clean function.
clean(mapping)
mapping['1000268201_693b08cb0e']
['startseq child in pink dress is climbing up set of stairs in an entry way endseq', 'startseq girl going into wooden building endseq', 'startseq little girl climbing into wooden playhouse endseq', 'startseq little girl climbing the stairs to her playhouse endseq', 'startseq little girl in pink dress going into wooden cabin endseq']
Here we can see the output without the punctuation
Now we store all the captions from the dict in a single list.
# Flatten the per-image caption lists into one list of all captions.
all_captions = [caption for key in mapping for caption in mapping[key]]
len(all_captions)
40455
#printing the 10 captions
all_captions[:10]
['startseq child in pink dress is climbing up set of stairs in an entry way endseq', 'startseq girl going into wooden building endseq', 'startseq little girl climbing into wooden playhouse endseq', 'startseq little girl climbing the stairs to her playhouse endseq', 'startseq little girl in pink dress going into wooden cabin endseq', 'startseq black dog and spotted dog are fighting endseq', 'startseq black dog and tri-colored dog playing with each other on the road endseq', 'startseq black dog and white dog with brown spots are staring at each other in the street endseq', 'startseq two dogs of different breeds looking at each other on the road endseq', 'startseq two dogs on pavement moving toward each other endseq']
tokenizer = Tokenizer()
# Fit on every caption so each word gets an integer index.
tokenizer.fit_on_texts(all_captions)
# +1 because Keras reserves index 0 for sequence padding.
vocab_size = len(tokenizer.word_index) +1
print(vocab_size)
8485
# get maximum length of the caption available
# (in words; every input sequence is padded to this fixed length)
max_length = max(len(caption.split()) for caption in all_captions)
max_length
35
Train and test split
# 80/20 train/test split over image ids.
# NOTE(review): ids are taken in dict-insertion order, not shuffled —
# confirm a non-random split is intentional.
image_ids = list(mapping.keys())
split = int(len(image_ids) * 0.80)
train = image_ids[:split]
test = image_ids[split:]
# create data generator to get data in batch (avoids session crash)
def data_generator(data_keys, mapping, features, tokenizer, max_length, vocab_size, batch_size):
    """Yield batches of ((image_features, padded_prefix), next_word) forever.

    For every caption, each prefix of its encoded sequence becomes one
    sample: inputs = image feature vector + padded word-index prefix,
    target = one-hot vector of the next word. A batch is flushed after
    every `batch_size` images (not samples), so batch sizes vary.
    """
    X1, X2, y = [], [], []
    n = 0
    while True:
        for key in data_keys:
            n += 1
            for caption in mapping[key]:
                # encode the caption into word indices
                seq = tokenizer.texts_to_sequences([caption])[0]
                # split the sequence into (prefix, next-word) pairs
                for i in range(1, len(seq)):
                    in_seq, out_seq = seq[:i], seq[i]
                    # pad the prefix to a fixed length
                    in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                    # one-hot encode the target word
                    out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
                    X1.append(features[key][0])
                    X2.append(in_seq)
                    y.append(out_seq)
            if n == batch_size:
                # FIX: yield a tuple of inputs, not a list — newer
                # tf.keras Model.fit rejects list-typed generator inputs,
                # while a tuple works on all versions.
                yield (np.array(X1), np.array(X2)), np.array(y)
                X1, X2, y = [], [], []
                n = 0
# Training hyper-parameters (re-declared again just before the training loop).
epochs = 20
batch_size = 32
# one "step" = one generator batch covering `batch_size` images
steps = len(train) // batch_size
# --- encoder ---
# image branch: 4096-d VGG16 features -> dropout -> 256-d dense
inputs1 = Input(shape=(4096,))
img_drop = Dropout(0.4)(inputs1)
img_dense = Dense(256, activation='relu')(img_drop)
# text branch: word indices -> embedding (padding masked) -> dropout -> LSTM
inputs2 = Input(shape=(max_length,))
txt_embed = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
txt_drop = Dropout(0.4)(txt_embed)
txt_lstm = LSTM(256)(txt_drop)
# --- decoder ---
# merge the two branches and predict the next word over the vocabulary
merged = add([img_dense, txt_lstm])
hidden = Dense(256, activation='relu')(merged)
outputs = Dense(vocab_size, activation='softmax')(hidden)
model = Model(inputs=[inputs1, inputs2], outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam')
# plot the model (requires pydot + graphviz to be installed)
plot_model(model, show_shapes=True)
You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.
# train the model
epochs = 20
batch_size = 32
steps = len(train) // batch_size
for i in range(epochs):
    # create a fresh data generator for this epoch
    generator = data_generator(train, mapping, features, tokenizer, max_length, vocab_size, batch_size)
    # fit for one epoch at a time so each epoch restarts the generator
    model.fit(generator, epochs=1, steps_per_epoch=steps, verbose=1)
202/202 [==============================] - 27s 119ms/step - loss: 5.3098 202/202 [==============================] - 26s 131ms/step - loss: 4.0657 202/202 [==============================] - 29s 142ms/step - loss: 3.6080 202/202 [==============================] - 32s 160ms/step - loss: 3.3343 202/202 [==============================] - 53s 265ms/step - loss: 3.1329 202/202 [==============================] - 56s 276ms/step - loss: 2.9824 202/202 [==============================] - 58s 289ms/step - loss: 2.8672 202/202 [==============================] - 59s 290ms/step - loss: 2.7696 202/202 [==============================] - 49s 242ms/step - loss: 2.6855 202/202 [==============================] - 63s 313ms/step - loss: 2.6112 202/202 [==============================] - 52s 257ms/step - loss: 2.5450 202/202 [==============================] - 60s 298ms/step - loss: 2.4938 202/202 [==============================] - 61s 302ms/step - loss: 2.4457 202/202 [==============================] - 39s 192ms/step - loss: 2.3967 202/202 [==============================] - 55s 270ms/step - loss: 2.3514 202/202 [==============================] - 60s 294ms/step - loss: 2.3114 202/202 [==============================] - 34s 170ms/step - loss: 2.2728 202/202 [==============================] - 44s 217ms/step - loss: 2.2399 202/202 [==============================] - 49s 244ms/step - loss: 2.2051 202/202 [==============================] - 65s 324ms/step - loss: 2.1774
# Save the trained model; os.path.join is safer than string concatenation.
# NOTE(review): the HDF5 (.h5) format is legacy in newer Keras — kept here
# for compatibility with the rest of this script.
model.save(os.path.join(WORKING_DIR, 'best_model.h5'))
def idx_to_word(integer, tokenizer):
    """Return the word mapped to `integer` by the tokenizer, or None.

    Uses the tokenizer's built-in reverse table (`index_word`) instead of
    linearly scanning `word_index` on every call — this runs once per
    generated word, so the O(vocab) scan was a needless hot-path cost.
    """
    return tokenizer.index_word.get(integer)
# generate caption for an image
def predict_caption(model, image, tokenizer, max_length):
    """Greedily decode a caption from pre-extracted image features."""
    # generation always starts from the start-of-sequence tag
    in_text = 'startseq'
    for _ in range(max_length):
        # encode and pad the words generated so far
        seq = tokenizer.texts_to_sequences([in_text])[0]
        seq = pad_sequences([seq], max_length)
        # pick the most probable next word
        probs = model.predict([image, seq], verbose=0)
        word = idx_to_word(np.argmax(probs), tokenizer)
        # stop on an unknown index
        if word is None:
            break
        # append the word and continue from the extended prefix
        in_text += " " + word
        # stop at the end-of-sequence tag
        if word == 'endseq':
            break
    return in_text
from nltk.translate.bleu_score import corpus_bleu
# Evaluate the model on the held-out split with corpus BLEU.
actual, predicted = [], []
for key in tqdm(test):
    # reference captions for this image, tokenized into words
    references = [caption.split() for caption in mapping[key]]
    # greedy-decoded hypothesis, tokenized the same way
    hypothesis = predict_caption(model, features[key], tokenizer, max_length).split()
    actual.append(references)
    predicted.append(hypothesis)
# calculate BLEU scores
print("BLEU-1: %f" % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
print("BLEU-2: %f" % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
0%| | 0/1619 [00:00<?, ?it/s]
BLEU-1: 0.537607 BLEU-2: 0.309126 BLEU-2: 0.309126
from PIL import Image
import matplotlib.pyplot as plt
def generate_caption(image_name):
    """Print the reference captions and the model's prediction for one image."""
    image_id = image_name.split('.')[0]
    image = Image.open(os.path.join(BASE_DIR, "Images", image_name))
    print('---------------------Actual---------------------')
    for ref in mapping[image_id]:
        print(ref)
    print('--------------------Predicted--------------------')
    # the prediction uses the pre-extracted features keyed by image id
    print(predict_caption(model, features[image_id], tokenizer, max_length))
    plt.imshow(image)
generate_caption("1001773457_577c3a7d70.jpg")
---------------------Actual--------------------- startseq black dog and spotted dog are fighting endseq startseq black dog and tri-colored dog playing with each other on the road endseq startseq black dog and white dog with brown spots are staring at each other in the street endseq startseq two dogs of different breeds looking at each other on the road endseq startseq two dogs on pavement moving toward each other endseq --------------------Predicted-------------------- startseq two dogs are playing with toy in the grass endseq
generate_caption("1002674143_1b742ab4b8.jpg")
---------------------Actual--------------------- startseq little girl covered in paint sits in front of painted rainbow with her hands in bowl endseq startseq little girl is sitting in front of large painted rainbow endseq startseq small girl in the grass plays with fingerpaints in front of white canvas with rainbow on it endseq startseq there is girl with pigtails sitting in front of rainbow painting endseq startseq young girl with pigtails painting outside in the grass endseq --------------------Predicted-------------------- startseq woman in purple dress is sitting in the grass with rainbow painting in the background endseq
generate_caption("101669240_b2d3e7f17b.jpg")
---------------------Actual--------------------- startseq man in hat is displaying pictures next to skier in blue hat endseq startseq man skis past another man displaying paintings in the snow endseq startseq person wearing skis looking at framed pictures set up in the snow endseq startseq skier looks at framed pictures in the snow next to trees endseq startseq man on skis looking at artwork for sale in the snow endseq --------------------Predicted-------------------- startseq skier in blue coat is displaying pictures in the snow endseq
generate_caption("49553964_cee950f3ba.jpg")
---------------------Actual--------------------- startseq man holding onto ropes while boogie boarding endseq startseq man holds onto ropes and is pulled through the water on his ski endseq startseq man rides wakeboard attached to parachute endseq startseq man windsurfing endseq startseq the man is waterskiing endseq --------------------Predicted-------------------- startseq person in yellow and blue is riding on the water with buildings in the background endseq
model.predict
<bound method Model.predict of <keras.engine.functional.Functional object at 0x0000029BF5C4DDE0>>
generate_caption("49553964_cee950f3ba.jpg")
---------------------Actual--------------------- startseq man holding onto ropes while boogie boarding endseq startseq man holds onto ropes and is pulled through the water on his ski endseq startseq man rides wakeboard attached to parachute endseq startseq man windsurfing endseq startseq the man is waterskiing endseq --------------------Predicted-------------------- startseq person in yellow and blue is riding on the water with buildings in the background endseq
import cv2
import os
# Raw string avoids invalid '\M'/'\I' escape sequences in the Windows path.
BASE_DIR = r'E:\Masters\Image_captioning'
def blur_flickr8k_images(src_dir, dst_dir):
    """Blur images in Flickr8k dataset.

    Args:
        src_dir (str): Directory containing original images
        dst_dir (str): Directory to save blurred images
    """
    # exist_ok avoids the race between the exists() check and makedirs()
    os.makedirs(dst_dir, exist_ok=True)
    for img_file in os.listdir(src_dir):
        img_path = os.path.join(src_dir, img_file)
        img = cv2.imread(img_path)
        # cv2.imread returns None for unreadable/non-image files;
        # skip them instead of crashing inside GaussianBlur
        if img is None:
            continue
        # heavy blur: 51x51 Gaussian kernel, sigma derived from the kernel
        blurred = cv2.GaussianBlur(img, (51, 51), 0)
        cv2.imwrite(os.path.join(dst_dir, img_file), blurred)
from PIL import Image
import matplotlib.pyplot as plt
# Raw string avoids invalid escape sequences in the Windows path.
BASE_DIR_cust = r'E:\Masters\Image_captioning'
def generate_caption_cust(image_name):
    """Like generate_caption, but loads the image from the 'Cust' folder.

    NOTE(review): the prediction still uses the ORIGINAL pre-extracted
    features for this image id, not features of the modified image —
    confirm that is intended.
    """
    image_id = image_name.split('.')[0]
    img_path = os.path.join(BASE_DIR_cust, "Cust", image_name)
    print(img_path)
    image = Image.open(img_path)
    # BUG FIX: plt.show() takes no image argument — the original
    # plt.show(image) could not display anything; draw the image, then show.
    plt.imshow(image)
    plt.show()
    captions = mapping[image_id]
    print('---------------------Actual---------------------')
    for caption in captions:
        print(caption)
    # predict the caption
    y_pred = predict_caption(model, features[image_id], tokenizer, max_length)
    print('--------------------Predicted--------------------')
    print(y_pred)
    plt.imshow(image)
from PIL import Image
from skimage import transform
import numpy as np
import matplotlib.pyplot as plt
import skimage.io as io
import cv2
import skimage
def read_image(img):
    """Load an image file from the dataset's Images folder (BGR array)."""
    return cv2.imread(os.path.join(BASE_DIR, "Images", img))
def perform_test(test):
    """Run each image-degradation test and caption the degraded image."""
    threshold(test)
    generate_caption_cust(test)
    restoration_resolution(test)
    generate_caption_cust(test)
    negative(test)
    # BUG FIX: the original called generate_caption() here, which reads
    # from the Images folder — use generate_caption_cust() like the other
    # two tests so the negative image written to Cust/ is the one shown.
    generate_caption_cust(test)
    return
def write(img, img_nam):
    """Save *img* into the Cust folder under the given file name."""
    cv2.imwrite(os.path.join(BASE_DIR_cust, "Cust", img_nam), img)
# Test - 1: thresholding the image.
def threshold(image_):
    """Binarize each colour channel at 128 and save the merged result."""
    src = read_image(image_)
    channels = cv2.split(src)  # OpenCV channel order is B, G, R
    binarized = [
        cv2.threshold(ch, 128, 255, cv2.THRESH_BINARY)[1]
        for ch in channels
    ]
    write(cv2.merge(binarized), image_)
# Test - 2: restoration after resolution loss.
def restoration_resolution(img_):
    """Downsample to 12x12 and upsample back, simulating detail loss."""
    original = read_image(img_)
    tiny = cv2.resize(original, (12, 12))
    restored = cv2.resize(tiny, (original.shape[1], original.shape[0]))
    write(restored, img_)
# Test - 3: negative.
def negative(img_):
    """Invert the image (photographic negative) and save it."""
    img = read_image(img_)
    # For uint8 pixel data 255 - img is already non-negative, so the
    # original abs() wrapper was redundant; plain subtraction inverts.
    col_neg = 255 - img
    write(col_neg, img_)
# NOTE(review): `test` here is the held-out image-id list from the
# train/test split, so test[0] is the first test-split image id; the
# `test` parameter of perform_test() shadows that list's name.
name = "{}.jpg".format(test[0])
perform_test(name)
3601569729_bf4bf82768.jpg E:\Masters\Image_captioning\Cust\3601569729_bf4bf82768.jpg ---------------------Actual--------------------- startseq group of race horses run down track carrying jockeys endseq startseq horse race endseq startseq jockeys on horses during race endseq startseq the horses race on the dirt track while their riders urge them on endseq startseq "there are riders and horses in horse race going around track ." endseq --------------------Predicted-------------------- startseq pack of horses and horses in the dirt endseq E:\Masters\Image_captioning\Cust\3601569729_bf4bf82768.jpg
---------------------Actual--------------------- startseq group of race horses run down track carrying jockeys endseq startseq horse race endseq startseq jockeys on horses during race endseq startseq the horses race on the dirt track while their riders urge them on endseq startseq "there are riders and horses in horse race going around track ." endseq --------------------Predicted-------------------- startseq pack of horses and horses in the dirt endseq ---------------------Actual--------------------- startseq group of race horses run down track carrying jockeys endseq startseq horse race endseq startseq jockeys on horses during race endseq startseq the horses race on the dirt track while their riders urge them on endseq startseq "there are riders and horses in horse race going around track ." endseq --------------------Predicted-------------------- startseq pack of horses and horses in the dirt endseq
3601569729_bf4bf82768